s6_220301_manuscript_figures

nlc

3/1/2022

Set up

Load libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(Seurat)
## Attaching SeuratObject
library(clusterProfiler)
## 
## clusterProfiler v4.4.4  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## 
## The following object is masked from 'package:purrr':
## 
##     simplify
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(org.Mm.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## 
## The following object is masked from 'package:clusterProfiler':
## 
##     rename
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:clusterProfiler':
## 
##     slice
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## The following object is masked from 'package:grDevices':
## 
##     windows
## 
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## 
## The following object is masked from 'package:dplyr':
## 
##     select

set seed

set.seed(6)

load s3_celltypes.rds

so_merge <- readRDS('analysis_files/s4_so_merge.rds')

Create cluster / celltype dictionary

# Colors for clusters
color_dict <- so_merge@meta.data %>%
  dplyr::select(lineage, cluster_l2, celltype_full) %>%
  distinct() %>%
  arrange(lineage, cluster_l2)

color_dict$color <- scales::hue_pal()(nrow(color_dict))

color_dict <- color_dict %>%
  arrange(cluster_l2)

# Colors for lineage
lineage_color_dict <- so_merge@meta.data %>%
  dplyr::select(lineage) %>%
  distinct() %>%
  arrange(lineage)

lineage_color_dict$color <- scales::hue_pal()(nrow(lineage_color_dict))

lineage_cols <- lineage_color_dict$color
names(lineage_cols) <- lineage_color_dict$lineage

# Colors for species
species_cols <- RColorBrewer::brewer.pal(n = 3, name = 'Set1')[1:2]
names(species_cols) <- c('mouse', 'human')

Figure 6

Figure 6A - lineage & cluster umap

1200 x 600 pixels

p1 <- DimPlot(so_merge,
        group.by = 'lineage',
        label = FALSE,
        repel = TRUE,
        cols = lineage_color_dict$color)+
    coord_equal()+
  ggtitle('Lineage')+
  theme(plot.title = element_text(size = 25, face = 'bold'),
        legend.text = element_text(face = 'bold'))+
  guides(color = guide_legend(override.aes = list(size = 7)))

p2 <- DimPlot(so_merge,
        group.by = 'seurat_clusters',
        label = TRUE,
        label.size = 6,
        repel = FALSE,
        cols = color_dict$color)+
  coord_equal()+
  ggtitle('Unsupervised Clusters')+
  theme(plot.title = element_text(size = 25, face = 'bold'),
        legend.position = 'none')

p2+p1

Figure 6B - Tumor-celltype distribution

# Pull metadata and construct tumor<->cellfrac_type dictionary
so_merge_meta <- so_merge@meta.data

tumor_cellfrac <- so_merge_meta %>%
  dplyr::select(tumor, cellfrac_type) %>%
  distinct()

# compute lineage proportions & join with tumor<->cellfrac_type dictionary
props <- so_merge_meta %>%
  dplyr::select(lineage, tumor) %>%
  table() %>%
  prop.table(., margin = 2)

prop_gather <- as_tibble(props)

prop_gather <- full_join(x = prop_gather,
                         y = tumor_cellfrac,
                         by = 'tumor',
                         suffix = c('', ''))

f6b <- ggplot(prop_gather, aes(x = tumor, y = n, fill = lineage))+
  geom_col()+
  theme_bw()+
  facet_grid(~cellfrac_type, scales = 'free_x', space = 'free_x')+
  ylab('Fraction of tumor')+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave("fig6b.png", f6b, height = 3, width = 4.5, dpi = 500)

Figure 6C - Distribution of unsupervised clusters (cell types)

1080 x 540

so_meta <- so_merge@meta.data

so_meta_tumors <- so_meta %>%
  dplyr::select(celltype_full, cellfrac_type, lineage, tumor) %>%
  group_by(tumor, cellfrac_type, celltype_full, lineage) %>%
  summarize(n_cells = n()) %>%
  group_by(tumor) %>%
  mutate(freq = n_cells/sum(n_cells))
## `summarise()` has grouped output by 'tumor', 'cellfrac_type', 'celltype_full'.
## You can override using the `.groups` argument.
so_meta_tumors_grouped <- so_meta_tumors %>%
  group_by(cellfrac_type, celltype_full) %>%
  mutate(mean_freq = mean(freq)) %>%
  mutate(freq_error_ymin = mean(freq) - sd(freq)) %>%
  mutate(freq_error_ymax = mean(freq) + sd(freq))

so_meta_tumors_summary <- so_meta_tumors %>%
  group_by(cellfrac_type, celltype_full) %>%
  summarize(tumor_count = n(),
            mean_freq = mean(freq),
            freq_sd_ymin = mean(freq) - sd(freq),
            freq_sd_ymax = mean(freq) + sd(freq),
            freq_sem_ymin = mean(freq) - sd(freq)/(tumor_count)^0.5,
            freq_sem_ymax = mean(freq) + sd(freq)/(tumor_count)^0.5,
            lineage = lineage)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## `summarise()` has grouped output by 'cellfrac_type', 'celltype_full'. You can
## override using the `.groups` argument.
# mean freq of tumor
ggplot(so_meta_tumors_summary, aes(y = mean_freq, x = celltype_full, fill = cellfrac_type))+
  geom_col(position = 'dodge')+
  geom_point(data = so_meta_tumors, aes(x = celltype_full, y = freq), color = 'gray50', size = 1.5)+
  theme_bw()+
  geom_errorbar(aes(x = celltype_full, ymin = freq_sem_ymin, ymax = freq_sem_ymax), position = 'dodge', width = 0.5)+
  facet_grid(rows = vars(cellfrac_type), cols = vars(lineage), scales = 'free_x', space = 'free_x')+
  theme(strip.text.x = element_text(angle = 90))+
  ylab('Mean frequency \n (fraction of tumor)')+
  xlab('Celltype \n (cluster-celltype)')+
  RotatedAxis()

Fig 6D: UMAP by lineage

## epi
so_epi <- subset(so_merge, subset = lineage == 'epithelial')
so_epi <- RunUMAP(so_epi, reduction = 'iNMF', dims = 1:50)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:17:01 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:01 Read 2824 rows and found 50 numeric columns
## 17:17:01 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:01 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:01 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c17bb2fa9
## 17:17:01 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:02 Annoy recall = 100%
## 17:17:02 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:03 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:03 Commencing optimization for 500 epochs, with 121422 positive edges
## 17:17:11 Optimization finished
DimPlot(so_epi,
        group.by = 'celltype_full',
        label = TRUE,
        label.size = 5)+
  coord_equal()+
  ggtitle('Epithelial')

VlnPlot(so_epi,
        features = c('Myc', 'Pten'))

## fibro
so_fibro <- subset(so_merge, subset = lineage == 'fibroblast')
so_fibro <- RunUMAP(so_fibro, reduction = 'iNMF', dims = 1:50)
## 17:17:13 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:13 Read 1533 rows and found 50 numeric columns
## 17:17:13 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:13 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:13 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c32323dd9
## 17:17:13 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:13 Annoy recall = 100%
## 17:17:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:15 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:15 Commencing optimization for 500 epochs, with 61640 positive edges
## 17:17:19 Optimization finished
DimPlot(so_fibro,
        group.by = 'celltype_full',
        label = TRUE,
        label.size = 5)+
  coord_equal()+
  ggtitle('Fibroblast')

## lym
so_lym <- subset(so_merge, subset = lineage == 'lymphoid')
so_lym <- RunUMAP(so_lym, reduction = 'iNMF', dims = 1:50)
## 17:17:21 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:21 Read 3414 rows and found 50 numeric columns
## 17:17:21 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:21 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:22 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c59355c01
## 17:17:22 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:23 Annoy recall = 100%
## 17:17:23 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:24 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:24 Commencing optimization for 500 epochs, with 163904 positive edges
## 17:17:34 Optimization finished
DimPlot(so_lym,
        group.by = 'celltype_full',
        label = TRUE,
        label.size = 5)+
  coord_equal()+
  ggtitle('Lymphoid')

## mye
so_mye <- subset(so_merge, subset = lineage == 'myeloid')
so_mye <- RunUMAP(so_mye, reduction = 'iNMF', dims = 1:50)
## 17:17:36 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:36 Read 5706 rows and found 50 numeric columns
## 17:17:36 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:36 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:37 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c246b422
## 17:17:38 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:39 Annoy recall = 100%
## 17:17:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:41 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:41 Commencing optimization for 500 epochs, with 269200 positive edges
## 17:17:56 Optimization finished
DimPlot(so_mye,
        group.by = 'celltype_full',
        label = TRUE,
        label.size = 5,
        repel = TRUE)+
  coord_equal()+
  ggtitle('Myeloid')

Figure 6E+6F: Epithelial heatmap + ego

720 x 720 heatmap

540 x 540 emapplot

library(enrichplot)


so_epi <- subset(so_merge, subset = lineage == 'epithelial')

epi_markers <- read_csv('analysis_files/s4_epithelial_markers.csv')
## Rows: 5181 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_epi_markers <- epi_markers %>%
  group_by(cluster) %>%
  filter(p_val_adj <= 0.01) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 5)

DoHeatmap(so_epi,
          features = top_epi_markers$gene)+
  ggtitle('Top epithelial genes')

DotPlot(so_epi,
        features = top_epi_markers$gene,
        group.by = 'celltype_full',
        cols = 'RdYlBu')+
  RotatedAxis()

epi_plots <- list()

for(i in unique(epi_markers$cluster)){

  # Identify current DEG
 curr_markers <- epi_markers %>%
    filter(cluster == i) %>%
  mutate(significantly_upregulated = avg_log2FC > 0.5 & p_val_adj <= 0.01)
 
 # extract significant DEG
 top_curr <- curr_markers %>%
  filter(avg_log2FC >= .5 & p_val_adj <= 0.01)
 

  ego_curr <- enrichGO(top_curr$gene,
                  OrgDb = org.Mm.eg.db,
                  keyType = 'SYMBOL',
                  ont = 'ALL')
  
  ego_curr <- pairwise_termsim(ego_curr)
  
  
  p1 <- ggplot(curr_markers, aes(x = avg_log2FC, y = -log10(p_val_adj), label = gene, color = significantly_upregulated))+
    geom_point()+
    ggrepel::geom_text_repel()+
    theme_bw()+
    ylab('Significance \n [ -log10(p_val_adj) ]')+
    xlab('Average log2 Fold Change')+
    ggtitle(paste0(i, ' vs all other epithelial'))+
    theme(legend.position = 'bottom')+
    labs(color = 'Significantly Upregulated')+
    scale_color_manual(values = c('black', 'red'))
  
  p2 <- treeplot(ego_curr)
  p3 <- dotplot(ego_curr)
  p4 <- emapplot(ego_curr,
         cex_label_category = 0.5,
         repel = TRUE,
         group_category = FALSE)+
    ggtitle(paste0('Cluster ', i, ' top ontologies'))
  
  # display plots in loop
  print(p1)
  print(p2)
  print(p3)
  print(p4)
  
  # stash plots for access later
  epi_plots[[as.character(i)]]$volcano <- p1
  epi_plots[[as.character(i)]]$tree <- p2
  epi_plots[[as.character(i)]]$dot <- p3
  epi_plots[[as.character(i)]]$emap <- p4
  
}
## Warning: ggrepel: 1044 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 933 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 1489 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 413 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 1161 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Figure 6G+6H: Heatmap of Fibroblast markers + Enrichment plot

dotplot: 600 x 480

so_fibro <- subset(so_merge, subset = lineage == 'fibroblast')

fibro_markers <- read_csv('analysis_files/s4_fibro_markers.csv')
## Rows: 2935 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_fibro_markers <- fibro_markers %>%
  group_by(cluster) %>%
  filter(p_val_adj <= 0.01) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 10)

DoHeatmap(so_fibro,
          features = top_fibro_markers$gene)+
  ggtitle('Top fibroblast genes')

fibro_plots <- list()

for(i in unique(fibro_markers$cluster)){

  # Identify current DEG
 curr_markers <- fibro_markers %>%
    filter(cluster == i) %>%
  mutate(significantly_upregulated = avg_log2FC > 0.5 & p_val_adj <= 0.01)
 
 # extract significant DEG
 top_curr <- curr_markers %>%
  filter(avg_log2FC >= .5 & p_val_adj <= 0.01)
  
 # Enrichment of gene ontology using MSIGDB
  ego_curr <- enrichGO(top_curr$gene,
                  OrgDb = org.Mm.eg.db,
                  keyType = 'SYMBOL',
                  ont = 'ALL')
  
  ego_curr <- pairwise_termsim(ego_curr)
  
  # Generate plots
  p1 <- ggplot(curr_markers, aes(x = avg_log2FC, y = -log10(p_val_adj), label = gene, color = significantly_upregulated))+
    geom_point()+
    ggrepel::geom_text_repel()+
    theme_bw()+
    ylab('Significance \n [ -log10(p_val_adj) ]')+
    xlab('Average log2 Fold Change')+
    ggtitle(paste0(i, ' vs all other fibroblast'))+
    theme(legend.position = 'bottom')+
    labs(color = 'Significantly Upregulated')+
    scale_color_manual(values = c('black', 'red'))
  
  p2 <- treeplot(ego_curr)
  p3 <- dotplot(ego_curr)
  p4 <- emapplot(ego_curr,
         cex_label_category = 0.5,
         repel = TRUE,
         group_category = FALSE)+
    ggtitle(paste0('Cluster ', i, ' top ontologies'))
  
  # display plots in loop
  print(p1)
  print(p2)
  print(p3)
  print(p4)
  
  # stash plots for access later
  fibro_plots[[as.character(i)]]$volcano <- p1
  fibro_plots[[as.character(i)]]$tree <- p2
  fibro_plots[[as.character(i)]]$dot <- p3
  fibro_plots[[as.character(i)]]$emap <- p4
  
}
## Warning: ggrepel: 677 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 1175 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: ggrepel: 986 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Figure 7:

Fig 7B: ScPred classification of murine clusters

900 x 800

so_merge_scpred <- readRDS('analysis_files/s5_so_merge_subset_mda_scpred.rds')

freq <- so_merge_scpred@meta.data %>%
  dplyr::select(celltype_full, scpred_prediction) %>%
  table()

row.names(freq) <- paste0('MM: ', row.names(freq))
colnames(freq) <- paste0('HS: ', colnames(freq))

row_anno <- so_merge_scpred@meta.data %>%
  dplyr::select(lineage, celltype_full) %>%
  distinct() %>%
  mutate(celltype_full = paste0('MM: ', celltype_full))

row_anno <- data.frame(lineage = row_anno$lineage,
                       row.names = row_anno$celltype_full)

pheatmap::pheatmap(prop.table(freq, margin = 1),
                   annotation_row = row_anno,
                   main = 'scPred assignment \n Row fraction',
                   color = colorRampPalette(c('white', 'black'))(50),
                   annotation_colors = list(lineage = lineage_cols))

# ARI of matrix without epithelial cells
scpred_stroma <- so_merge_scpred@meta.data %>%
  filter(lineage != 'epithelial')

aricode::ARI(c1 = scpred_stroma$celltype_full,
             c2 = scpred_stroma$scpred_prediction)
## [1] 0.4082607
# ARI of only epithelial cells
scpred_epi <- so_merge_scpred@meta.data %>%
  filter(lineage == 'epithelial')

aricode::ARI(c1 = scpred_epi$celltype_full,
             c2 = scpred_epi$scpred_prediction)
## [1] 0.09099002
# Remove so to reclaim memory
rm(so_merge_scpred)

Figure 7c: Integrated data iNMF score vs celltype/cluster

Load integrated data and prepare metadata

so_rliger <- readRDS('analysis_files/s5_xspecies_uinmf_so.rds')

so_rliger@meta.data$species <- str_split(rownames(so_rliger@meta.data), pattern = '_', simplify = TRUE)[,1]

celltype_major_l1_dict <- rbind(c('Endothelial', 'endothelial'),
                                c('CAFs', 'fibroblast'),
                                c('PVL', 'perivascular'),
                                c('B-cells', 'lymphoid'),
                                c('T-cells', 'lymphoid'),
                                c('Myeloid', 'myeloid'),
                                c('Normal Epithelial', 'epithelial'),
                                c('Plasmablasts', 'lymphoid'),
                                c('Cancer Epithelial', 'epithelial'))

# Convert celltype_major to celltype_l1

so_rliger@meta.data$celltype_major_to_l1[so_rliger@meta.data$species == 'human'] <- plyr::mapvalues(so_rliger@meta.data$celltype_major[so_rliger@meta.data$species == 'human'],
                                                     from = celltype_major_l1_dict[,1],
                                                     to = celltype_major_l1_dict[,2])

# Combine celltype_l1 from each species
so_rliger@meta.data <- tidyr::unite(data = so_rliger@meta.data,
                                    col = 'celltype_l1.5',
                                    celltype_major_to_l1,
                                    lineage,
                                    na.rm = TRUE,
                                    sep = '',
                                    remove = FALSE)

# Combine celltype_l2 and celltype_minor
so_rliger@meta.data <- tidyr::unite(data = so_rliger@meta.data,
                                    col = 'celltype_l2.5',
                                    celltype_full,
                                    celltype_minor,
                                    na.rm = TRUE,
                                    sep = '',
                                    remove = FALSE)

# Combine human 'subtype' and mouse 'cellfrac_type' 
so_rliger@meta.data <- tidyr::unite(data = so_rliger@meta.data,
                                    col = 'tumor_subtype',
                                    subtype,
                                    cellfrac_type,
                                    na.rm = TRUE,
                                    sep = '',
                                    remove = FALSE)

# Combine human 'Patient' and mouse 'cellfrac_type' 
so_rliger@meta.data <- tidyr::unite(data = so_rliger@meta.data,
                                    col = 'sample_id',
                                    Patient,
                                    tumor,
                                    na.rm = TRUE,
                                    sep = '',
                                    remove = FALSE)

# Add species label to celltype_l2.5

so_rliger@meta.data$species_celltype <- paste0(recode(so_rliger@meta.data$species,
                                                      'mouse' = 'MM: ',
                                                      'human' = 'HS: '),
                                               so_rliger@meta.data$celltype_l2.5)


species_celltype_dict <- so_rliger@meta.data %>%
  dplyr::select(species, celltype_l2.5) %>%
  distinct()

1200 x 1100

# Average iNMF embedding to show relationship to cluster, annotated by celltype_l1

cluster_embedding <- tibble(seurat_clusters = so_rliger@meta.data$species_celltype) %>%
  cbind(so_rliger@reductions$inmf@cell.embeddings) %>%
  group_by(seurat_clusters) %>%
  summarize(across(everything(), list(mean)))

row_anno_tibble <- so_rliger@meta.data %>%
  dplyr::select(species_celltype, celltype_l1.5, species) %>%
  distinct()

row_anno <- data.frame(species = row_anno_tibble$species,
                       lineage = row_anno_tibble$celltype_l1.5,
                       row.names = row_anno_tibble$species_celltype)


pheatmap::pheatmap(t(scale(t(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
                              row.names = cluster_embedding$seurat_clusters)))),
                   scale = 'none',
                   main = 'Average UINMF embedding per cluster \n row z-scored',
                   annotation_row = row_anno,
                   annotation_colors = list(lineage = lineage_cols,
                                            species = species_cols))

Figure 7D: Original cluster/celltype vs integrated cluster

1200 x 1200

clust_lineage <- so_rliger@meta.data %>%
  dplyr::select(RNA_snn_res.0.6, celltype_l1.5) %>%
  table()

clust_lineage_prop <- prop.table(clust_lineage, margin = 1)

for(i in 1:nrow(clust_lineage_prop)){
  
  if(max(clust_lineage_prop[i,]) > 0.8){
    curr_lineage <- names(which.max(clust_lineage_prop[i,]))
  }else{
    curr_lineage <- 'mixed'
  }
  
  if(i == 1){
    cluster_lineage_dict <- tibble(cluster = row.names(clust_lineage_prop)[i],
                                   consensus_lineage = curr_lineage)
  }else{
    cluster_lineage_dict <- rbind(cluster_lineage_dict,
                                  tibble(cluster = row.names(clust_lineage_prop)[i],
                                         consensus_lineage = curr_lineage))
  }
}

consensus_lineage_cols <- RColorBrewer::brewer.pal(n = length(unique(cluster_lineage_dict$consensus_lineage)), 'Set1')
names(consensus_lineage_cols) <- unique(cluster_lineage_dict$consensus_lineage)

so_rliger_meta <- so_rliger@meta.data

freq_table <- so_rliger_meta %>%
  dplyr::select(species_celltype, RNA_snn_res.0.6) %>%
  table()


row_anno <- so_rliger_meta %>% 
  dplyr::select(species_celltype, species, celltype_l1.5) %>%
  distinct()

row_anno <- data.frame(species = row_anno$species,
                       lineage = row_anno$celltype_l1.5,
                       row.names = row_anno$species_celltype)

row_anno$lineage <- factor(row_anno$lineage, levels = unique(cluster_lineage_dict$consensus_lineage))

col_anno <- data.frame(row.names = cluster_lineage_dict$cluster,
                       consensus_lineage = cluster_lineage_dict$consensus_lineage)

anno_colors <- list(lineage = lineage_cols,
                    consensus_lineage = consensus_lineage_cols,
                    species = species_cols)

col_order <- cluster_lineage_dict %>%
  arrange(consensus_lineage, cluster)

pheatmap::pheatmap(prop.table(freq_table, margin = 1),
                   annotation_row = row_anno,
                   annotation_col = col_anno,
                   annotation_colors = anno_colors,
                   color = colorRampPalette(colors = c('white', 'black'))(100),
                   main = 'Original identity versus integrated cluster \n (Row fraction)',
                   cluster_cols = TRUE)

Figure 7E: Integrated UMAP colored by lineage

DimPlot(so_rliger,
        group.by = 'celltype_l1.5',
        label = FALSE,
        repel = TRUE,
        cols = lineage_color_dict$color)+
    coord_equal()+
  ggtitle('Lineage')+
  theme(plot.title = element_text(size = 25, face = 'bold'),
        legend.text = element_text(face = 'bold'))+
  guides(color = guide_legend(override.aes = list(size = 7)))
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

Figure 7F: UMAP split by species

480 x 480 per umap

# Randomly subset to an equivalent number

species_count <- table(so_rliger@meta.data$species)

mouse_barcodes <- so_rliger@meta.data %>%
  mutate(cell_id = rownames(so_rliger@meta.data)) %>%
  filter(species == 'mouse') %>%
  dplyr::select(cell_id) %>%
  pull()

set.seed(6)
human_barcodes <- so_rliger@meta.data %>%
  mutate(cell_id = rownames(so_rliger@meta.data)) %>%
  filter(species == 'human') %>%
  mutate(random_n = sample(species_count[1])) %>%
  filter(random_n <= species_count[2]) %>%
  dplyr::select(cell_id) %>%
  pull()

so_rliger_subset <- subset(so_rliger, cells = c(mouse_barcodes, human_barcodes))

DimPlot(so_rliger_subset,
        group.by = 'RNA_snn_res.0.6',
        label = TRUE,
        label.size = 5,
        split.by = 'species')+
    coord_equal()+
    theme(legend.position = 'none')+
  ggtitle('Unsupervised clusters \n (subset to equal n)')

Supplementary Figure 13:

S13D: Integrated scRNAseq Lineage vs fraction of cluster

1200 x 300

clust_lineage <- so_rliger@meta.data %>%
  dplyr::select(RNA_snn_res.0.6, celltype_l1.5) %>%
  table()

clust_lineage_prop <- prop.table(clust_lineage, margin = 1)

pheatmap::pheatmap(t(clust_lineage_prop),
                   display_numbers = round(t(clust_lineage_prop),2),
                   main = 'Fraction of lineage per cluster',
                   color = colorRampPalette(colors = c('white', 'black'))(100))

for(i in 1:nrow(clust_lineage_prop)){
  
  if(max(clust_lineage_prop[i,]) > 0.8){
    curr_lineage <- names(which.max(clust_lineage_prop[i,]))
  }else{
    curr_lineage <- 'mixed'
  }
  
  if(i == 1){
    cluster_lineage_dict <- tibble(cluster = row.names(clust_lineage_prop)[i],
                                   consensus_lineage = curr_lineage)
  }else{
    cluster_lineage_dict <- rbind(cluster_lineage_dict,
                                  tibble(cluster = row.names(clust_lineage_prop)[i],
                                         consensus_lineage = curr_lineage))
  }
}

Patient breakdown

meta_human <- so_rliger@meta.data %>%
  filter(species == 'human') %>%
  mutate(og_label = celltype_minor) %>%
  mutate(sample_id = Patient)

meta_human$celltype_l1 <- plyr::mapvalues(x = meta_human$celltype_major,
                                          from = celltype_major_l1_dict[,1],
                                          to = celltype_major_l1_dict[,2])

meta_mouse <- so_rliger@meta.data %>%
  filter(species == 'mouse') %>%
  mutate(og_label = celltype_full) %>%
  mutate(sample_id = tumor)

meta <- rbind(meta_mouse, meta_human)

freq_table <- meta %>%
  dplyr::select(RNA_snn_res.0.9, sample_id) %>%
  table()

col_anno <- meta %>%
  dplyr::select(sample_id, subtype, cellfrac_type, species) %>%
  distinct()

col_anno <- data.frame(subtype = col_anno$subtype,
                       cellfrac_type = col_anno$cellfrac_type,
                       species = col_anno$species,
                       row.names = col_anno$sample_id)

pheatmap::pheatmap(prop.table(freq_table, margin = 2),
                   annotation_col = col_anno)

# Supplementary figure 13

S13A - UMAP comparison - no integration, harmony and iNMF

all: 500 x 500

# Harmony
so_harmony <- RunUMAP(so_merge, dims = 1:50, reduction = 'harmony')
## 17:24:23 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:24:23 Read 14042 rows and found 50 numeric columns
## 17:24:23 Using Annoy for neighbor search, n_neighbors = 30
## 17:24:23 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:24:25 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c39dc102e
## 17:24:25 Searching Annoy index using 1 thread, search_k = 3000
## 17:24:29 Annoy recall = 100%
## 17:24:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:24:31 Initializing from normalized Laplacian + noise (using irlba)
## 17:24:33 Commencing optimization for 200 epochs, with 660676 positive edges
## 17:24:47 Optimization finished
harmony_umap <- DimPlot(so_harmony,
                        group.by = 'tumor')

rm(so_harmony)

# No integration
so_noint <- RunPCA(so_merge)
## PC_ 1 
## Positive:  Cd74, Hmgb2, H2-Ab1, Wfdc17, Lyz2, Bcl2a1b, C1qa, C1qb, S100a9, S100a8 
##     C1qc, Cd3g, Ccrl2, Clec4d, Clec4e, Ms4a7, Trbc2, Il1rn, Nlrp3, Igkc 
##     Gatm, AW112010, Vps37b, Itgae, Ccl4, Irf8, Trem1, Cx3cr1, Ifitm1, Apoe 
## Negative:  Fstl1, Col3a1, Aebp1, Dcn, Serping1, Col1a2, Fbn1, Mmp2, Sparc, Col5a2 
##     Col1a1, Lum, Mfap5, Rarres2, Adamts5, C1s1, Fn1, Bgn, Serpina3n, Loxl1 
##     Serpinh1, Ccdc80, Col5a1, Tnfaip6, Col6a1, Ogn, Clec3b, Thbs2, Serpinf1, Cpxm1 
## PC_ 2 
## Positive:  Epcam, Fxyd3, Wfdc18, Krt8, Krt18, Lcn2, Plet1, Cldn7, Csn3, Nfib 
##     Ehf, Cldn3, Spint2, Dbi, Aqp5, Sox9, Igfbp5, Trf, Elf5, Folr1 
##     Tspan8, Smim22, Apoc1, Ano1, Phlda1, Mfge8, Ptprf, Me1, Rab25, Irx2 
## Negative:  Fth1, Cd74, Wfdc17, H2-Ab1, Lyz2, Apoe, Bcl2a1b, C1qa, C1qb, C1qc 
##     S100a9, Ifitm1, Clec4d, Ccrl2, Clec4e, Ms4a7, Gsn, Igfbp6, S100a8, Hmgb2 
##     Nlrp3, Cx3cr1, AW112010, Cxcl2, Il1r2, Il1rn, Ccl4, Slfn5, Trem1, Cd3g 
## PC_ 3 
## Positive:  Fth1, Cxcl2, Ier3, Basp1, Lcn2, S100a8, S100a9, C3, Clec4e, Clec4d 
##     G0s2, Wfdc18, Ccrl2, Nlrp3, Wfdc17, Il1rn, Cstb, Igfbp6, Mt1, Cd24a 
##     Il1r2, Mgst1, Pi16, Krt18, Trem1, Tnfaip6, Ifitm1, Plet1, Acod1, Hp 
## Negative:  Cdh5, Egfl7, Gpihbp1, Flt1, Emcn, Plvap, Aqp1, Esam, Pecam1, Ptprb 
##     Kdr, Igfbp3, Col4a2, Adgrf5, Col4a1, Adgrl4, Mmrn2, Sox18, Gng11, Cd93 
##     Eng, Ednrb, Cldn5, Sparcl1, Sema6d, Sema3f, Mcam, Ramp2, Ecscr, Podxl 
## PC_ 4 
## Positive:  Cd3g, AW112010, Trbc2, Rps2, Il2rb, Ctsw, Itgae, Hsp90ab1, Nkg7, Ndufa4 
##     Trac, Trbc1, Top2a, Ikzf2, Icos, Cd7, Pclaf, Xcl1, Dut, Txk 
##     Stmn1, Mki67, Eef1g, Smc2, Tcrg-C1, Gzmb, Prc1, Il2ra, Klrk1, Trdc 
## Negative:  S100a9, S100a8, Clec4d, Clec4e, Cxcl2, G0s2, Nlrp3, Ifitm1, Acod1, Ccrl2 
##     Trem1, Il1rn, Wfdc21, Hcar2, Ier3, Il1f9, Wfdc17, Il1r2, Lrg1, Cstdc4 
##     Upp1, Stfa2l1, Slfn4, Basp1, Ccl3, Retnlg, Ptgs2, Asprv1, Fth1, Hp 
## PC_ 5 
## Positive:  Postn, Matn4, Il17b, Krt5, Tpm2, Ecrg4, Col11a1, Slpi, Igfbp2, Col8a1 
##     Krt14, Col7a1, Col16a1, Acta2, Tagln, Col12a1, Col17a1, Bgn, Myl9, Hmcn1 
##     Grb14, Trp63, Gas1, Mgp, Lgr5, Moxd1, Tnc, Megf6, Pdgfrl, Samd5 
## Negative:  Ly6c1, Pcolce2, Cd34, Tnxb, Clec3b, Efemp1, Itih5, Opcml, Has1, Dpp4 
##     Cadm3, Pi16, Scara5, Sfrp4, Adgrd1, Efhd1, Gfpt2, Slc4a4, Uap1, Ackr3 
##     Heg1, Pla1a, Lrrn4cl, Ifi205, Pcsk6, Cd55, C4b, Tmem100, Tnfaip6, Gas7
so_noint <- RunUMAP(so_noint, dims = 1:50, reduction = 'pca')
## 17:24:55 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:24:55 Read 14042 rows and found 50 numeric columns
## 17:24:55 Using Annoy for neighbor search, n_neighbors = 30
## 17:24:55 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:24:57 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c42705ec2
## 17:24:57 Searching Annoy index using 1 thread, search_k = 3000
## 17:25:00 Annoy recall = 100%
## 17:25:01 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:25:03 Initializing from normalized Laplacian + noise (using irlba)
## 17:25:04 Commencing optimization for 200 epochs, with 627536 positive edges
## 17:25:19 Optimization finished
noint_umap <- DimPlot(so_noint,
                      group.by = 'tumor')+
  coord_equal()
rm(so_noint)

# iNMF
inmf_umap <- DimPlot(so_merge,
                     group.by = 'tumor')+
  coord_equal()


noint_umap+
  ggtitle('No integration \n 50 PCs')

harmony_umap+
  ggtitle('Harmony integrated \n 50 components')

inmf_umap+
  ggtitle('rLiger integrated \n 50 Factors')

S13B - clustering optimization

Created in s3_clustering_optimization.rmd

S13C - cluster vs lineage marker dotplot

1080x480 pixels

DotPlot(so_merge,
        features = c('Ptprc', 'Cd74', 'Epcam', 'Cd14', 'Cd3e', 'Pdgfra', 'Krt5', 'Pecam1', 'Mki67', 'Pdgfrb'),
        cols = 'RdYlBu',
        group.by = 'cluster_l2')+
  theme_bw()+
  coord_flip()

S13D/E - c14 subset: umap & cluster vs lineage marker dotplot

480 x 480 both

library(harmony)
## Loading required package: Rcpp
c14 <- subset(so_merge,
              subset = seurat_clusters == 14)

c14 <- FindVariableFeatures(c14)
c14 <- NormalizeData(c14)
c14 <- ScaleData(c14)
## Centering and scaling data matrix
c14 <- RunPCA(c14)
## PC_ 1 
## Positive:  Tmsb10, Trbc2, Sept1, Cd3g, Ablim1, Cd3d, Ptprcap, Trac, Skap1, S100a10 
##     Il2rb, Ctsw, Ctla2a, Pdcd4, Ltb, Lck, S100a6, Rora, Thy1, Nkg7 
##     Cd226, Cxcr6, Dut, Sh2d2a, Trbc1, Ikzf3, Il18r1, Itgae, Inpp4b, Ahnak 
## Negative:  C1qc, Ctss, C1qa, C1qb, Mpeg1, Cx3cr1, Apoe, Csf1r, Aif1, Lgmn 
##     Cd74, Ctsc, Ms4a7, Cxcl16, Cst3, Ctsz, Hexb, Gatm, Cybb, Tyrobp 
##     Tgfbi, Lyz2, Cd68, Ctsb, Pld4, Psap, Ftl1, H2-Ab1, Fcer1g, Man2b1 
## PC_ 2 
## Positive:  Lcn2, Wfdc18, Csn3, Epcam, Nfib, Fxyd3, Trf, Cldn7, Cldn3, Igfbp5 
##     Cald1, Krt8, Dbi, Krt18, Aqp5, Sox9, Plet1, Sox4, Phlda1, Nfix 
##     Nedd4, Scgb2b27, Auts2, Apoc1, Tpm1, Cd9, Fermt2, Chil1, Ptprf, Cd24a 
## Negative:  AW112010, Srgn, Ltb, Crip1, Ptprcap, Ctsw, Vim, Cd3g, Ablim1, Sept1 
##     Il2rb, Nkg7, S100a4, Trbc2, Gimap4, Skap1, Ifngr1, Rgs1, Itgae, S100a10 
##     Cd226, Inpp4b, Bcl2, Lck, Esyt1, H2-Q7, Ptpn22, Cd3d, Atad2, Gzmb 
## PC_ 3 
## Positive:  AY036118, Igkc, Maf, Il7r, Gm26917, Ccnb1ip1, Icos, Apoe, Cxcr6, Cd6 
##     Rora, Lst1, Cd74, Il2ra, Ms4a4b, Rmrp, Lgals1, Hbb-bs, Trac, Hba-a1 
##     Ly6a, Ccr2, Cd3d, Cd4, Il18r1, Hbb-bt, Ttc39c, H2-Aa, Ifi27l2a, Thy1 
## Negative:  Cemip2, Gzmb, Car2, Xcl1, Hsp90ab1, Rps2, Klra5, Ier5l, Ctla2b, Junb 
##     Rps18, Cdv3, Npm1, Hspd1, Fasl, Nme2, Nme1, Eif5a, Errfi1, Rpl12 
##     Sult2b1, Cebpb, Eef1b2, Nr4a2, Rps3, Eef1g, Klrk1, Itih5, Sfr1, Cited4 
## PC_ 4 
## Positive:  Lgals1, Cavin3, Maf, S100a4, S100a10, Rhobtb3, Tmem64, Serpinb1a, Palld, Cav1 
##     Trdc, Cd163l1, Il7r, Rora, Ptpn14, Col6a1, Ramp1, Capg, Ppic, Phlda3 
##     Fat1, Igfbp3, Trdv4, Cx3cl1, Tpm2, Timp1, Krt15, Cd3d, Fkbp9, Ikzf2 
## Negative:  Xcl1, Scgb2b27, Klrd1, Nr4a2, Scgb1b27, Cemip2, Gzmb, Ncr1, Serpinb9, Klra5 
##     Car6, Ctla2a, Etv1, Wfdc18, Gzma, Vps37b, Serpinb6b, Chn2, Errfi1, Aqp5 
##     Neurl3, Ppp1r3b, Cd7, Klre1, Nkg7, Hba-a1, Hbb-bs, Gzmc, Ctla2b, Lcn2 
## PC_ 5 
## Positive:  Krt15, Dsc3, Krt5, Krt14, Sparc, Fat1, Fkbp9, Vcan, Igsf10, Ppic 
##     Tagln, Slpi, Trp63, Cav1, Acta2, Postn, S100a16, Serpine2, A430105J06Rik, Igfbp3 
##     Hmcn1, Cxcl14, Phlda3, Sfrp1, Brinp3, Col4a1, Tpm2, Palld, Col6a1, Gas1 
## Negative:  Nme1, Nme2, Chchd10, Kcnk1, Rplp0, Rps3, Elf5, Atp2c2, 4931406C07Rik, Cldn7 
##     Cystm1, Kcnn4, Eef1b2, Ppp1r2, Wfdc18, Npm1, Prss8, Rpl12, 1110008P14Rik, Erh 
##     Nhp2, Ass1, Epcam, Rpsa, Smim22, Cd24a, Plet1, Rab25, Cldn3, Ehf
c14 <- RunHarmony(c14,
                        group.by.vars = 'library_id')
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony converged after 7 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details
## on syntax validity
ElbowPlot(c14, reduction = 'harmony')

c14 <- RunUMAP(c14, reduction = 'harmony', dims = 1:5,
                  seed.use = 500)
## 17:25:29 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:25:29 Read 340 rows and found 5 numeric columns
## 17:25:29 Using Annoy for neighbor search, n_neighbors = 30
## 17:25:29 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:25:29 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c5d9cd8c
## 17:25:29 Searching Annoy index using 1 thread, search_k = 3000
## 17:25:29 Annoy recall = 100%
## 17:25:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:25:32 Initializing from normalized Laplacian + noise (using irlba)
## 17:25:32 Commencing optimization for 500 epochs, with 11592 positive edges
## 17:25:33 Optimization finished
DimPlot(c14,
        label = TRUE,
        label.size = 6)+
  ggtitle('Cluster 14 \n (5 Harmony components)')

DotPlot(c14,
        features = c('Ptprc', 'Cd74', 'Epcam', 'Cd14', 'Cd3e', 'Pdgfra', 'Krt5', 'Pecam1', 'Pdgfrb'),
        cols = 'RdYlBu',
        group.by = 'cluster_l2')+
  theme_bw()+
  coord_flip()
## Warning: Scaling data with a low number of groups may produce misleading
## results

S13F: UMAP by cellfrac_type

1080x540

# Find smallest number of cells per cellfrac type
n_per_cellfrac_type <- so_merge@meta.data %>%
  group_by(cellfrac_type) %>%
  summarize(n_cells = n())

n_smallest <- min(n_per_cellfrac_type$n_cells)

# Randomly subsample each cellfrac type

set.seed(7)

sp_barcodes <- so_merge@meta.data %>%
  mutate(cell_id = rownames(so_merge@meta.data)) %>%
  filter(cellfrac_type == 'SP') %>%
  mutate(random_n = sample(nrow(.))) %>%
  filter(random_n <= n_smallest) %>%
  dplyr::select(cell_id) %>%
  pull()

set.seed(8)

srip_barcodes <- so_merge@meta.data %>%
  mutate(cell_id = rownames(so_merge@meta.data)) %>%
  filter(cellfrac_type == 'SR_IP') %>%
  mutate(random_n = sample(nrow(.))) %>%
  filter(random_n <= n_smallest) %>%
  dplyr::select(cell_id) %>%
  pull()

set.seed(9)

srir_barcodes <- so_merge@meta.data %>%
  mutate(cell_id = rownames(so_merge@meta.data)) %>%
  filter(cellfrac_type == 'SR_IR') %>%
  mutate(random_n = sample(nrow(.))) %>%
  filter(random_n <= n_smallest) %>%
  dplyr::select(cell_id) %>%
  pull()

# Plot UMAP by cellfrac type

DimPlot(subset(so_merge, cells = c(sp_barcodes, srip_barcodes, srir_barcodes)),
        group.by = 'lineage',
        label = FALSE,
        split.by = 'cellfrac_type',
        pt.size = 1.5)+
    coord_equal()+
    theme(legend.position = 'bottom')+
  ggtitle('Lineage by cellfrac_type \n (subset to equal n)')+
  guides(color = guide_legend(nrow = 1, override.aes = list(size = 5)))

Supplemental 13G

960 x 480 pixels

so_merge_meta <- so_merge@meta.data %>%
  mutate(tumor_pheno = paste0(tumor, '-', histology))

freq_table <- so_merge_meta %>%
  dplyr::select(lineage, tumor_pheno) %>%
  table()

cellfrac_type_dict <- so_merge_meta %>%
  dplyr::select(cellfrac_type, tumor_pheno) %>%
  distinct()

col_anno <- tibble(histology = str_split(colnames(freq_table), pattern = '-', simplify = TRUE)[,2],
                       log10_n_cells = log10(colSums(freq_table)),
                       tumor_pheno = colnames(freq_table))

col_anno <- full_join(x = col_anno,
                       y = cellfrac_type_dict,
                       by = 'tumor_pheno',
                       suffix = c('', ''))

col_anno <- data.frame(histology = col_anno$histology,
                       cellfrac_type = col_anno$cellfrac_type,
                       log10_n_cells = col_anno$log10_n_cells,
                       row.names = col_anno$tumor_pheno)


cellfrac_cols <- scales::hue_pal()(3)
names(cellfrac_cols) <- c('SP', 'SR_IP', 'SR_IR')

anno_colors <- list(cellfrac_type = cellfrac_cols,
                   histology = c(SP = 'magenta', SR = 'cyan'))

pheatmap::pheatmap(prop.table(freq_table, margin = 2), 
                   display_numbers = freq_table,
                   main = 'column scaled (cell fraction) \n Number = n_cells per celltype per tumor',
                   annotation_col = col_anno,
                   annotation_colors = anno_colors)

Supplementary Figure 14

S14A - Top DEG dotplot

1920 x 720

markers <- read_csv('analysis_files/s3_cluster_l2_markers.csv')
## Rows: 43572 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_markers <- markers %>%
  filter(p_val_adj <= 0.01) %>%
  filter(pct.1 >= 0.5) %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice_head(n = 3)

DotPlot(so_merge,
        features = unique(top_markers$gene),
        cols = 'RdYlBu',
        group.by = 'celltype_full')+
  RotatedAxis()

S14B/C - NMF score vs lineage/cluster

lineage vs iNMF embedding: 1080 x 480

cluster vs iNMF embedding: 1080 x 600

# Average iNMF embeddings to show relationship to celltype_l1
cluster_embedding <- tibble(lineage = so_merge@meta.data$lineage) %>%
  cbind(so_merge@reductions$iNMF@cell.embeddings) %>%
  group_by(lineage) %>%
  summarize(across(everything(), list(mean)))

pheatmap::pheatmap(t(scale(t(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
                              row.names = cluster_embedding$lineage)))),
                   scale = 'none',
                   main = 'Average iNMF embedding per cluster \n Z-scored by row')

# Average iNMF embedding to show relationship to cluster, annotated by celltype_l1
cluster_embedding <- tibble(celltype_full = so_merge@meta.data$celltype_full) %>%
  cbind(so_merge@reductions$iNMF@cell.embeddings) %>%
  group_by(celltype_full) %>%
  summarize(across(everything(), list(mean)))

annotation_rows <- so_merge@meta.data %>%
  dplyr::select(lineage, celltype_full) %>%
  group_by_all() %>%
  distinct()

annotation_rows <- data.frame(celltype = annotation_rows$lineage,
                              row.names = annotation_rows$celltype_full)

pheatmap::pheatmap(t(scale(t(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
                              row.names = cluster_embedding$celltype_full)))),
                   scale = 'none',
                   main = 'Average iNMF embedding per cluster \n Z-scored by row',
                   annotation_row = annotation_rows)

S14D: Fraction of tumor by lineage (only lineages with >1 cluster)

so_meta <- so_merge@meta.data

subtype_counts <- so_meta %>%
  dplyr::select(tumor, cellfrac_type) %>%
  distinct() %>%
  group_by(cellfrac_type) %>%
  summarize(n_tumors = n())

so_meta_tumors <- so_meta %>%
  filter(lineage %in% c('epithelial', 'fibroblast', 'lymphoid', 'myeloid')) %>%
  dplyr::select(celltype_full, cellfrac_type, lineage, tumor) %>%
  group_by(tumor, cellfrac_type, celltype_full, lineage) %>%
  summarize(n_cells = n()) %>%
  group_by(tumor, lineage) %>%
  mutate(freq = n_cells/sum(n_cells))
## `summarise()` has grouped output by 'tumor', 'cellfrac_type', 'celltype_full'.
## You can override using the `.groups` argument.
so_meta_tumors_grouped <- full_join(x = so_meta_tumors,
                                    y = subtype_counts,
                                    by = 'cellfrac_type') %>%
  group_by(cellfrac_type, celltype_full) %>%
  mutate(mean_freq = mean(freq)) %>%
  mutate(freq_sem_ymin = mean(freq) - sd(freq)/(n_tumors^0.5)) %>%
  mutate(freq_sem_ymax = mean(freq) + sd(freq)/(n_tumors^0.5))

so_meta_tumors_summary <- so_meta_tumors %>%
  group_by(cellfrac_type, celltype_full) %>%
  summarize(tumor_count = n(),
            mean_freq = mean(freq),
            freq_sd_ymin = mean(freq) - sd(freq),
            freq_sd_ymax = mean(freq) + sd(freq),
            freq_sem_ymin = mean(freq) - sd(freq)/(tumor_count)^0.5,
            freq_sem_ymax = mean(freq) + sd(freq)/(tumor_count)^0.5,
            lineage = lineage)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## `summarise()` has grouped output by 'cellfrac_type', 'celltype_full'. You can
## override using the `.groups` argument.
# mean freq of tumor
ggplot(so_meta_tumors_summary, aes(y = mean_freq, x = celltype_full, fill = cellfrac_type))+
  geom_col(position = 'dodge')+
  geom_point(data = so_meta_tumors, aes(x = celltype_full, y = freq), color = 'gray50', size = 1.5)+
  theme_bw()+
  geom_errorbar(aes(x = celltype_full, ymin = freq_sem_ymin, ymax = freq_sem_ymax), position = 'dodge', width = 0.5)+
  facet_grid(rows = vars(cellfrac_type), cols = vars(lineage), scales = 'free_x', space = 'free_x')+
  theme(strip.text.x = element_text(angle = 90))+
  ylab('Mean frequency \n (fraction of tumor)')+
  xlab('Celltype \n (cluster-celltype)')+
  RotatedAxis()

S14D - Cluster distribution per tumor

1600 x 720

so_meta <- so_merge@meta.data

so_meta_tumors2 <- so_meta %>%
  dplyr::select(celltype_full, cellfrac_type, lineage, tumor) %>%
  group_by(tumor, cellfrac_type, celltype_full, lineage) %>%
  summarize(n_cells = n()) %>%
  group_by(tumor) %>%
  mutate(freq = n_cells/sum(n_cells))
## `summarise()` has grouped output by 'tumor', 'cellfrac_type', 'celltype_full'.
## You can override using the `.groups` argument.
# Order tumors by subtype
tumor_order <- so_meta_tumors2 %>%
  ungroup() %>%
  dplyr::select(tumor, cellfrac_type) %>%
  distinct() %>%
  arrange(cellfrac_type) %>%
  dplyr::select(tumor) %>%
  pull()

# Add tumor order as factor to tumor id & find frequency of cluster per tumor
so_meta_tumors2 <- so_meta_tumors2 %>%
  mutate(tumor = factor(tumor, levels = tumor_order)) %>%
  group_by(tumor) %>%
  mutate(freq = n_cells/sum(n_cells))


# Plot
ggplot(so_meta_tumors2, aes(x = freq, y = celltype_full, fill = cellfrac_type))+
  geom_col()+
  theme_bw()+
  facet_grid(lineage~tumor, scales = 'free_y', space = 'free')+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.y = element_text(angle = 0))+
  theme(plot.margin = unit(c(1,1,1,1), 'cm'))+
  ylab('Cluster')+
  xlab('Frequency of cluster in tumor')

S14E: Epithelial biomarker expression

so_epi <- subset(so_merge,
                 subset = lineage == 'epithelial')

Idents(so_epi) <- 'celltype_full'

krts <- grep(rownames(so_epi),
             pattern = '^Krt',
             value = TRUE)

goi <- c(krts,
         'Epcam',
         'Vim',
         'Cd24',
         'Cd44')

goi_exp <- AverageExpression(so_epi,
                             features = goi)$RNA
## Warning: The following 1 features were not found in the RNA assay: Cd24
## Warning: The following 117 features were not found in the HTO assay:
## Krtap28-10, Krtap28-13, Krtcap2, Krtcap3, Krtdap, Krtap5-2, Krtap5-3, Krtap5-5,
## Krtap5-1, Krtap5-4, Krtap12-1, Krtap10-4, Krtap10-10, Krt222, Krt24, Krt25,
## Krt26, Krt27, Krt28, Krt10, Krt12, Krt20, Krt23, Krt39, Krt40, Krtap3-3,
## Krtap3-2, Krtap3-1, Krtap1-5, Krtap1-4, Krtap1-3, Krtap9-3, Krtap2-4, Krtap4-1,
## Krtap4-2, Krtap4-7, Krtap4-6, Krtap4-8, Krtap4-9, Krtap4-13, Krtap4-16,
## Krtap9-1, Krtap31-1, Krtap31-2, Krtap9-5, Krtap29-1, Krtap16-1, Krtap17-1,
## Krt33a, Krt33b, Krt34, Krt31, Krt32, Krt35, Krt36, Krt13, Krt15, Krt19, Krt9,
## Krt14, Krt16, Krt17, Krt42, Krt80, Krt7, Krt87, Krt88, Krt81, Krt86, Krt83,
## Krt84, Krt82, Krt90, Krt75, Krt6b, Krt6a, Krt5, Krt71, Krt72, Krt73, Krt2,
## Krt1, Krt77, Krt76, Krt4, Krt79, Krt78, Krt8, Krt18, Krtap24-1, Krtap26-1,
## Krtap27-1, Krtap13-1, Krtap13, Krtap14, Krtap15, Krtap19-1, Krtap19-2,
## Krtap19-3, Krtap19-4, Krtap19-5, Krtap19-9b, Krtap16-3, Krtap22-2, Krtap6-1,
## Krtap6-5, Krtap6-3, Krtap20-2, Krtap21-1, Krtap6-2, Krtap8-1, Krtap7-1,
## Krtap11-1, Epcam, Vim, Cd24, Cd44
## Warning: None of the features specified were found in the HTO assay.
goi_exp <- goi_exp[rowSums(goi_exp) > 1, ]

pheatmap::pheatmap(t(goi_exp),
                   color = colorRampPalette(c('purple', 'black', 'yellow'))(50),
                   main = 'Luminal-basal marker expression',
                   scale = 'column')

Supplemental Figure 15

S15A: UINMF jaccard index

560 x 860

top_n <- 50

n_shared_factors <- ncol(so_rliger@reductions$inmf@feature.loadings)


inmf_top_features <- list()

for(i in 1:n_shared_factors){
  curr_features <- so_rliger@reductions$inmf@feature.loadings[,i]
  
  top_features <- sort(curr_features, decreasing = TRUE)[1:top_n]
  
  inmf_top_features[[i]] <- names(top_features)
  
}


inmf_jaccard <- array(dim = c(n_shared_factors, n_shared_factors))
rownames(inmf_jaccard) <- colnames(inmf_jaccard) <- 1:n_shared_factors

for(i in 1:n_shared_factors){
  for(j in 1:n_shared_factors){
    inmf_jaccard[i,j] <- length(intersect(inmf_top_features[[i]], inmf_top_features[[j]]))/length(union(inmf_top_features[[i]], inmf_top_features[[j]]))
    
  }
}

pheatmap::pheatmap(inmf_jaccard,
                   color = colorRampPalette(colors = c('white', 'red'))(100),
                   breaks = seq(from = 0, to = 0.5, by = 0.005),
                   main = 'Jaccard similarity of UINMF shared-feature factor loadings',
                   cutree_rows = 7,
                   cutree_cols = 7)

S15B: Average UINMF embedding per lineage

lineage_embedding <- tibble(lineage = so_rliger@meta.data$celltype_l1.5) %>%
  cbind(so_rliger@reductions$inmf@cell.embeddings) %>%
  group_by(lineage) %>%
  summarize(across(everything(), list(mean)))

pheatmap::pheatmap(scale(t(data.frame(lineage_embedding[,2:ncol(lineage_embedding)],
                              row.names = lineage_embedding$lineage))),
                   scale = 'none',
                   main = 'Average UINMF embedding per lineage \n row z-scored',)

S15C: Clustering optimization

Figure produced in script S5_human_tnbc_integration_rliger

S15D: Fraction of lineage per cluster

1200 x 300

clust_lineage <- so_rliger@meta.data %>%
  dplyr::select(RNA_snn_res.0.6, celltype_l1.5) %>%
  table()

clust_lineage_prop <- prop.table(clust_lineage, margin = 1)

pheatmap::pheatmap(t(clust_lineage_prop),
                   display_numbers = round(t(clust_lineage_prop),2),
                   main = 'Fraction of lineage per cluster',
                   color = colorRampPalette(colors = c('white', 'black'))(100))

S15E: Sample composition vs unsupervised cluster

freqs <- so_rliger_meta %>%
  dplyr::select(sample_id, seurat_clusters) %>%
  table()

col_anno <- data.frame(row.names = cluster_lineage_dict$cluster,
                       lineage = cluster_lineage_dict$consensus_lineage)

row_anno <- so_rliger_meta %>%
  dplyr::select(sample_id, species, tumor_subtype) %>%
  distinct()

row_anno <- data.frame(row_anno[,-1],
                       row.names = row_anno$sample_id)

anno_colors <- list(lineage = lineage_cols)

col_order <- cluster_lineage_dict %>%
  arrange(consensus_lineage, cluster)

pheatmap::pheatmap(prop.table(freqs, margin = 1),
                   annotation_col = col_anno,
                   annotation_row = row_anno,
                   color = colorRampPalette(colors = c('white', 'black'))(100),
                   main = ('Sample composition \n Sample ID vs unsupervised cluster \n (Row Fraction)'))

sessionInfo()

sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] harmony_0.1.1         Rcpp_1.0.9            enrichplot_1.16.2    
##  [4] org.Mm.eg.db_3.15.0   AnnotationDbi_1.58.0  IRanges_2.32.0       
##  [7] S4Vectors_0.36.0      Biobase_2.58.0        BiocGenerics_0.44.0  
## [10] clusterProfiler_4.4.4 SeuratObject_4.1.3    Seurat_4.3.0.9001    
## [13] lubridate_1.9.2       forcats_1.0.0         stringr_1.5.0        
## [16] dplyr_1.1.0           purrr_1.0.1           readr_2.1.4          
## [19] tidyr_1.3.0           tibble_3.1.8          ggplot2_3.4.1        
## [22] tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2             spatstat.explore_3.0-6 reticulate_1.28       
##   [4] tidyselect_1.2.0       RSQLite_2.3.0          htmlwidgets_1.6.1     
##   [7] grid_4.2.2             BiocParallel_1.32.4    Rtsne_0.16            
##  [10] scatterpie_0.1.8       munsell_0.5.0          ragg_1.2.5            
##  [13] codetools_0.2-19       ica_1.0-3              future_1.32.0         
##  [16] miniUI_0.1.1.1         withr_2.5.0            spatstat.random_3.1-3 
##  [19] colorspace_2.0-3       GOSemSim_2.22.0        progressr_0.13.0      
##  [22] highr_0.10             knitr_1.42             rstudioapi_0.14       
##  [25] ROCR_1.0-11            tensor_1.5             DOSE_3.22.1           
##  [28] listenv_0.9.0          labeling_0.4.2         GenomeInfoDbData_1.2.9
##  [31] polyclip_1.10-4        pheatmap_1.0.12        bit64_4.0.5           
##  [34] farver_2.1.1           downloader_0.4         treeio_1.20.2         
##  [37] parallelly_1.34.0      vctrs_0.5.2            generics_0.1.3        
##  [40] xfun_0.37              timechange_0.1.1       R6_2.5.1              
##  [43] GenomeInfoDb_1.34.4    ggbeeswarm_0.7.1       graphlayouts_0.8.4    
##  [46] rmdformats_1.0.4       bitops_1.0-7           spatstat.utils_3.0-2  
##  [49] cachem_1.0.6           fgsea_1.22.0           gridGraphics_0.5-1    
##  [52] vroom_1.6.0            promises_1.2.0.1       scales_1.2.1          
##  [55] ggraph_2.1.0           beeswarm_0.4.0         gtable_0.3.1          
##  [58] globals_0.16.2         goftest_1.2-3          tidygraph_1.2.3       
##  [61] rlang_1.0.6            systemfonts_1.0.4      splines_4.2.2         
##  [64] lazyeval_0.2.2         spatstat.geom_3.0-6    yaml_2.3.6            
##  [67] reshape2_1.4.4         abind_1.4-5            httpuv_1.6.8          
##  [70] qvalue_2.28.0          tools_4.2.2            bookdown_0.33         
##  [73] ggplotify_0.1.0        ellipsis_0.3.2         jquerylib_0.1.4       
##  [76] RColorBrewer_1.1-3     ggridges_0.5.4         plyr_1.8.7            
##  [79] zlibbioc_1.44.0        RCurl_1.98-1.10        deldir_1.0-6          
##  [82] pbapply_1.7-0          viridis_0.6.2          cowplot_1.1.1         
##  [85] zoo_1.8-11             ggrepel_0.9.2          cluster_2.1.4         
##  [88] magrittr_2.0.3         data.table_1.14.6      scattermore_0.8       
##  [91] DO.db_2.9              lmtest_0.9-40          RANN_2.6.1            
##  [94] ggnewscale_0.4.8       fitdistrplus_1.1-8     matrixStats_0.63.0    
##  [97] hms_1.1.2              patchwork_1.1.2        mime_0.12             
## [100] evaluate_0.20          xtable_1.8-4           gridExtra_2.3         
## [103] compiler_4.2.2         shadowtext_0.1.2       KernSmooth_2.23-20    
## [106] crayon_1.5.2           htmltools_0.5.4        ggfun_0.0.9           
## [109] later_1.3.0            tzdb_0.3.0             aplot_0.1.10          
## [112] DBI_1.1.3              tweenr_2.0.2           MASS_7.3-58.1         
## [115] Matrix_1.5-1           cli_3.6.0              parallel_4.2.2        
## [118] igraph_1.3.5           pkgconfig_2.0.3        sp_1.6-0              
## [121] plotly_4.10.1          spatstat.sparse_3.0-0  ggtree_3.4.4          
## [124] vipor_0.4.5            bslib_0.4.2            XVector_0.38.0        
## [127] yulab.utils_0.0.6      digest_0.6.29          sctransform_0.3.5     
## [130] RcppAnnoy_0.0.20       spatstat.data_3.0-1    Biostrings_2.64.1     
## [133] rmarkdown_2.20         leiden_0.4.3           fastmatch_1.1-3       
## [136] tidytree_0.4.2         uwot_0.1.14            shiny_1.7.4           
## [139] aricode_1.0.1          lifecycle_1.0.3        nlme_3.1-160          
## [142] jsonlite_1.8.4         viridisLite_0.4.1      fansi_1.0.3           
## [145] pillar_1.8.1           lattice_0.20-45        ggrastr_1.0.1         
## [148] KEGGREST_1.36.3        fastmap_1.1.0          httr_1.4.5            
## [151] survival_3.4-0         GO.db_3.15.0           glue_1.6.2            
## [154] png_0.1-8              bit_4.0.5              ggforce_0.4.1         
## [157] stringi_1.7.8          sass_0.4.5             blob_1.2.3            
## [160] textshaping_0.3.6      memoise_2.0.1          ape_5.7               
## [163] irlba_2.3.5.1          future.apply_1.10.0